
---
title: "Choosing STEM degrees: STEM identity, science capital, and economic utility beliefs among young adults"
author: "Tendai Charles"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_float: true
    number_sections: true
    df_print: paged
  word_document: default
params:
  data_file: "aspires3_survey.dta"
  labels_file: "aspires3_variable_labels.csv"
  out_dir: "outputs_sem_controlled_revision"
  analysis_population: "undergrad"
  stem_definition: "core"
  use_sampling_weights: true
  weight_var: "weight"
  sem_estimator: "WLSMV"
  sem_missing: "pairwise"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width = 11,
  fig.height = 7,
  dpi = 150
)
options(stringsAsFactors = FALSE, scipen = 999)
```

# Overview

This R Markdown conducts an SEM on one wave of the ASPIRES 3 dataset.

1. It proposes a theoretical core—Science Capital, STEM Identity, and economic utility beliefs. It also treats these as part of a **controlled model** rather than a largely stand-alone descriptive specification.
2. It adds a **background-control block** covering gender, ethnicity, parental university participation, school type, area deprivation, and a composite measure of prior science preparation.
3. It estimates the SEMs using **WLSMV with pairwise handling of missing data**, `fixed.x = FALSE`, and `conditional.x = FALSE`, so that missingness on exogenous predictors does not trigger unnecessary listwise deletion.

Keep this `.Rmd` file in the same folder as:

- `aspires3_survey.dta`
- `aspires3_variable_labels.csv`

# 1. Package management

```{r packages}
required_pkgs <- c(
  "haven", "dplyr", "tidyr", "readr", "stringr", "purrr",
  "psych", "lavaan", "semTools", "semPlot", "qgraph",
  "pROC", "broom", "car", "tibble", "ggplot2", "knitr"
)

options(repos = c(CRAN = "https://cloud.r-project.org"))

to_install <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(to_install) > 0) {
  install.packages(to_install, dependencies = TRUE)
}

invisible(lapply(required_pkgs, library, character.only = TRUE))
rm(required_pkgs, to_install)

set.seed(1234)
```

```{r package-versions}
session_info_tbl <- tibble::tibble(
  package = c("R", "lavaan", "semTools"),
  version = c(
    R.version.string,
    as.character(utils::packageVersion("lavaan")),
    as.character(utils::packageVersion("semTools"))
  )
)

session_info_tbl |>
  knitr::kable(caption = "Key package versions used for this run")
```

# 2. User settings

```{r settings}
data_file    <- params$data_file
labels_file  <- params$labels_file
out_dir      <- params$out_dir

analysis_population  <- params$analysis_population
stem_definition      <- params$stem_definition
use_sampling_weights <- params$use_sampling_weights
weight_var           <- params$weight_var
sem_estimator        <- params$sem_estimator
sem_missing          <- params$sem_missing

dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

if (!file.exists(data_file)) {
  stop(
    paste0(
      "The data file '", data_file, "' was not found in the current working directory.\n",
      "Place this .Rmd file in the same folder as the ASPIRES 3 .dta file, or edit params$data_file."
    )
  )
}

labels_available <- file.exists(labels_file)
```

# 3. Helper functions

```{r helper-functions}
recode_user_missing <- function(x) {
  if (!is.numeric(x)) return(x)
  dplyr::na_if(
    dplyr::na_if(
      dplyr::na_if(
        dplyr::na_if(x, 996),
        997
      ),
      998
    ),
    999
  )
}

reverse_ordinal <- function(x, min_val, max_val, extra_missing = NULL) {
  if (!is.null(extra_missing)) {
    x[x %in% extra_missing] <- NA_real_
  }
  dplyr::if_else(is.na(x), NA_real_, (max_val + min_val) - x)
}

recode_yes_no <- function(x) {
  dplyr::case_when(
    x == 1 ~ 1,
    x == 2 ~ 0,
    TRUE   ~ NA_real_
  )
}

recode_visit_freq <- function(x) {
  dplyr::case_when(
    x == 1 ~ 1,
    x == 2 ~ 2,
    x == 3 ~ 3,
    x == 4 ~ 4,
    x == 5 ~ 0,
    TRUE   ~ NA_real_
  )
}

recode_qtalk <- function(x) {
  dplyr::case_when(
    x == 1 ~ 5,
    x == 2 ~ 4,
    x == 3 ~ 3,
    x == 4 ~ 2,
    x == 5 ~ 1,
    x == 6 ~ 0,
    TRUE   ~ NA_real_
  )
}

safe_scale <- function(x) {
  if (all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }

  mu <- mean(x, na.rm = TRUE)
  sig <- stats::sd(x, na.rm = TRUE)

  if (is.na(sig)) {
    return(rep(NA_real_, length(x)))
  }

  if (sig == 0) {
    return(dplyr::if_else(is.na(x), NA_real_, 0))
  }

  as.numeric((x - mu) / sig)
}

row_mean_min_n <- function(df, min_nonmiss = 1) {
  mat <- as.matrix(df)
  out <- rowMeans(mat, na.rm = TRUE)
  out[rowSums(!is.na(mat)) < min_nonmiss] <- NA_real_
  out[is.nan(out)] <- NA_real_
  out
}

row_any_yes <- function(df, yes_val = 1, min_nonmiss = 1) {
  mat <- as.matrix(df)
  nonmiss_n <- rowSums(!is.na(mat))
  any_yes <- rowSums(mat == yes_val, na.rm = TRUE) > 0
  out <- dplyr::if_else(nonmiss_n >= min_nonmiss, 0, NA_real_)
  out[any_yes] <- 1
  out
}

weighted_mean_na <- function(x, w) {
  keep <- !is.na(x) & !is.na(w)
  if (!any(keep)) return(NA_real_)
  sum(x[keep] * w[keep]) / sum(w[keep])
}

make_sample_compare <- function(data, include_var, vars, weight_var = "weight") {
  purrr::map_dfr(
    vars,
    function(v) {
      in_flag <- data[[include_var]] == 1
      out_flag <- data[[include_var]] == 0

      tibble::tibble(
        variable = v,
        overall = weighted_mean_na(data[[v]], data[[weight_var]]),
        included = weighted_mean_na(data[[v]][in_flag], data[[weight_var]][in_flag]),
        excluded = weighted_mean_na(data[[v]][out_flag], data[[weight_var]][out_flag]),
        abs_diff = abs(included - excluded),
        prop_missing = mean(is.na(data[[v]]))
      )
    }
  )
}

nagelkerke_r2 <- function(model) {
  mf <- stats::model.frame(model)
  y <- stats::model.response(mf)
  w <- stats::model.weights(mf)

  if (is.null(y)) {
    stop("Could not extract the response from the fitted model.")
  }

  null_df <- data.frame(.y = y)

  null_model <- if (is.null(w)) {
    stats::glm(.y ~ 1, family = stats::binomial(), data = null_df)
  } else {
    null_df$.w <- w
    stats::glm(.y ~ 1, family = stats::binomial(), data = null_df, weights = .w)
  }

  ll_full <- as.numeric(stats::logLik(model))
  ll_null <- as.numeric(stats::logLik(null_model))
  n <- stats::nobs(model)

  num <- 1 - exp((2 / n) * (ll_null - ll_full))
  den <- 1 - exp((2 / n) * ll_null)

  num / den
}

roc_from_logit <- function(model) {
  mf <- stats::model.frame(model)
  response <- stats::model.response(mf)
  predictor <- as.numeric(stats::fitted(model))

  if (is.factor(response) && nlevels(response) == 2) {
    response <- stats::relevel(response, ref = levels(response)[1])
  } else if (is.logical(response)) {
    response <- as.integer(response)
  }

  pROC::roc(response = response, predictor = predictor, quiet = TRUE)
}

extract_fit_table <- function(fit) {
  fm <- lavaan::fitMeasures(fit)
  wanted <- c(
    "chisq", "df", "pvalue", "cfi", "tli", "rmsea", "srmr",
    "cfi.robust", "tli.robust", "rmsea.robust"
  )
  wanted <- wanted[wanted %in% names(fm)]
  tibble::tibble(
    metric = wanted,
    value  = unname(fm[wanted])
  )
}

extract_fit_summary <- function(fit, model_name) {
  fm <- lavaan::fitMeasures(fit)
  tibble::tibble(
    model = model_name,
    n = unname(lavaan::inspect(fit, "nobs")),
    chisq = if ("chisq" %in% names(fm)) unname(fm[["chisq"]]) else NA_real_,
    df = if ("df" %in% names(fm)) unname(fm[["df"]]) else NA_real_,
    cfi = if ("cfi" %in% names(fm)) unname(fm[["cfi"]]) else NA_real_,
    tli = if ("tli" %in% names(fm)) unname(fm[["tli"]]) else NA_real_,
    rmsea = if ("rmsea" %in% names(fm)) unname(fm[["rmsea"]]) else NA_real_,
    srmr = if ("srmr" %in% names(fm)) unname(fm[["srmr"]]) else NA_real_,
    cfi_robust = if ("cfi.robust" %in% names(fm)) unname(fm[["cfi.robust"]]) else NA_real_,
    tli_robust = if ("tli.robust" %in% names(fm)) unname(fm[["tli.robust"]]) else NA_real_,
    rmsea_robust = if ("rmsea.robust" %in% names(fm)) unname(fm[["rmsea.robust"]]) else NA_real_
  )
}

extract_std_loadings <- function(fit) {
  lavaan::parameterEstimates(fit, standardized = TRUE) |>
    dplyr::filter(op == "=~") |>
    dplyr::transmute(
      construct   = lhs,
      indicator   = rhs,
      est         = est,
      se          = se,
      z           = z,
      p           = pvalue,
      std_loading = std.all
    ) |>
    dplyr::arrange(construct, dplyr::desc(std_loading))
}

extract_std_paths <- function(fit, include_cov = FALSE) {
  pe <- lavaan::parameterEstimates(fit, standardized = TRUE)

  keep_tbl <- pe |>
    dplyr::filter(op %in% c("~", ":="))

  if (isTRUE(include_cov)) {
    cov_tbl <- pe |>
      dplyr::filter(op == "~~", lhs != rhs)
    keep_tbl <- dplyr::bind_rows(keep_tbl, cov_tbl)
  }

  keep_tbl |>
    dplyr::transmute(
      lhs, op, rhs,
      est, se, z,
      p = pvalue,
      std = std.all
    )
}

safe_reliability_tbl <- function(fit) {
  tryCatch(
    {
      semTools::reliability(fit) |>
        as.data.frame() |>
        tibble::rownames_to_column("construct")
    },
    error = function(e) {
      tibble::tibble(
        construct = "reliability_not_available",
        message = e$message
      )
    }
  )
}

safe_htmt_tbl <- function(model_syntax, data, ordered = NULL) {
  tryCatch(
    {
      semTools::htmt(
        model = model_syntax,
        data = data,
        ordered = ordered,
        absolute = TRUE,
        htmt2 = TRUE
      ) |>
        as.data.frame() |>
        tibble::rownames_to_column("construct")
    },
    error = function(e) {
      tibble::tibble(
        construct = "htmt_not_available",
        message = e$message
      )
    }
  )
}

or_ci_table <- function(model) {
  est <- stats::coef(model)
  se  <- sqrt(diag(stats::vcov(model)))

  tibble::tibble(
    term      = names(est),
    estimate  = est,
    std.error = se,
    z         = est / se,
    p         = 2 * stats::pnorm(abs(est / se), lower.tail = FALSE),
    OR        = exp(est),
    CI_low    = exp(est - 1.96 * se),
    CI_high   = exp(est + 1.96 * se)
  )
}

fit_logit <- function(formula, data, use_weights = TRUE, weight_var = "weight") {
  if (!is.data.frame(data)) {
    stop("`data` must be a data.frame.")
  }

  if (isTRUE(use_weights)) {
    if (!is.character(weight_var) || length(weight_var) != 1) {
      stop("`weight_var` must be a single character string naming the weight column.")
    }

    if (!weight_var %in% names(data)) {
      stop(paste0("Weight column '", weight_var, "' was not found in `data`."))
    }

    data_local <- data |>
      dplyr::mutate(.model_weight = .data[[weight_var]])

    stats::glm(
      formula = formula,
      data = data_local,
      family = stats::binomial(),
      weights = .model_weight,
      na.action = stats::na.omit
    )
  } else {
    stats::glm(
      formula = formula,
      data = data,
      family = stats::binomial(),
      na.action = stats::na.omit
    )
  }
}

build_analysis_sample <- function(df,
                                  population = c("undergrad", "current_undergrad", "full_sample"),
                                  stem_def   = c("core", "broad")) {
  population <- match.arg(population)
  stem_def   <- match.arg(stem_def)

  dat <- df

  if (population == "undergrad") {
    dat <- dat |>
      dplyr::filter(level %in% c(1, 2))
  } else if (population == "current_undergrad") {
    dat <- dat |>
      dplyr::filter(level == 1)
  }

  if (stem_def == "core") {
    dat <- dat |>
      dplyr::filter(stem_deg %in% c(0, 2, 3, 4, 5, 6, 7, 8)) |>
      dplyr::mutate(stem_outcome = dplyr::if_else(stem_deg == 0, 0L, 1L))
  } else if (stem_def == "broad") {
    dat <- dat |>
      dplyr::filter(stem_deg %in% c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) |>
      dplyr::mutate(stem_outcome = dplyr::if_else(stem_deg == 0, 0L, 1L))
  }

  dat
}

write_csv_safely <- function(x, file) {
  readr::write_csv(x, file, na = "")
  invisible(file)
}

build_sem_layout <- function(node_names) {
  if ("ScienceCapitalLat" %in% node_names) {
    coord_map <- list(
      science_visit_index_z = c(0.06, 0.90),
      science_network_count_z = c(0.06, 0.78),
      qinformed_r = c(0.06, 0.66),
      qtalk_r = c(0.06, 0.54),
      scicap2_r = c(0.06, 0.42),
      ScienceCapitalLat = c(0.28, 0.66),
      fjob12_z = c(0.28, 0.90),
      scigen_z = c(0.28, 0.80),
      epr5_z = c(0.28, 0.70),
      epr13_r = c(0.48, 0.52),
      sciid2_r = c(0.48, 0.38),
      sciid1_r = c(0.48, 0.24),
      gender_woman = c(0.08, 0.22),
      gender_diverse = c(0.08, 0.10),
      eth_minoritised = c(0.22, 0.22),
      parent_uni_any = c(0.22, 0.10),
      school_selective = c(0.36, 0.22),
      school_private = c(0.36, 0.10),
      imd_quintile_z = c(0.50, 0.22),
      prior_science_prep_z = c(0.50, 0.10),
      STEMIdentity = c(0.74, 0.36),
      stem_outcome = c(0.93, 0.60)
    )
  } else {
    coord_map <- list(
      fjob12_z = c(0.08, 0.90),
      scigen_z = c(0.08, 0.78),
      epr5_z = c(0.08, 0.66),
      scicap_comp_z = c(0.08, 0.54),
      view2_z = c(0.08, 0.42),
      epr13_r = c(0.34, 0.52),
      sciid2_r = c(0.34, 0.38),
      sciid1_r = c(0.34, 0.24),
      gender_woman = c(0.08, 0.22),
      gender_diverse = c(0.08, 0.10),
      eth_minoritised = c(0.22, 0.22),
      parent_uni_any = c(0.22, 0.10),
      school_selective = c(0.46, 0.22),
      school_private = c(0.46, 0.10),
      imd_quintile_z = c(0.60, 0.22),
      prior_science_prep_z = c(0.60, 0.10),
      STEMIdentity = c(0.74, 0.36),
      stem_outcome = c(0.93, 0.60)
    )
  }

  layout_mat <- matrix(NA_real_, nrow = length(node_names), ncol = 2)
  rownames(layout_mat) <- node_names
  colnames(layout_mat) <- c("x", "y")

  known_nodes <- intersect(node_names, names(coord_map))
  for (nm in known_nodes) {
    layout_mat[nm, ] <- coord_map[[nm]]
  }

  missing_nodes <- setdiff(node_names, known_nodes)
  if (length(missing_nodes) > 0) {
    fallback_y <- seq(0.10, 0.90, length.out = length(missing_nodes))
    for (i in seq_along(missing_nodes)) {
      layout_mat[missing_nodes[i], ] <- c(0.15, fallback_y[i])
    }
  }

  layout_mat
}

make_smartpls_semplot <- function(fit, file_png, title_text = NULL) {
  tryCatch(
    {
      spm <- semPlot::semPlotModel(fit)
      node_names <- spm@Vars$name
      layout_mat <- build_sem_layout(node_names)
      r2_vals <- tryCatch(lavaan::inspect(fit, "r2"), error = function(e) NULL)

      nice_labels <- c(
        scicap2_r = "Scientific\nevidence",
        qtalk_r = "Talks about\nscience",
        qinformed_r = "Feels informed\nabout science",
        science_network_count_z = "STEM network\nbreadth",
        science_visit_index_z = "Informal science\nexposure",
        scicap_comp_z = "Science capital\ncomposite",
        sciid1_r = "I am a\nscience person",
        sciid2_r = "Others see me\nas science person",
        epr13_r = "I am good\nat science",
        epr5_z = "Science qual. ->\njob utility",
        scigen_z = "Science/tech ->\nfuture jobs",
        fjob12_z = "Income\norientation",
        view2_z = "Scientists earn\na lot",
        gender_woman = "Woman\n(ref. man)",
        gender_diverse = "Gender diverse /\nprefer not",
        eth_minoritised = "Minoritised\nethnicity",
        parent_uni_any = "Parent(s)\nwent to uni",
        school_selective = "Selective\nschool",
        school_private = "Private\nschool",
        imd_quintile_z = "IMD quintile\n(z)",
        prior_science_prep_z = "Prior science\npreparation",
        STEMIdentity = "STEM\nIdentity",
        ScienceCapitalLat = "Science\nCapital",
        stem_outcome = "Core STEM\nundergrad"
      )

      node_labels <- ifelse(
        node_names %in% names(nice_labels),
        unname(nice_labels[node_names]),
        node_names
      )

      if (!is.null(r2_vals)) {
        if ("ScienceCapitalLat" %in% names(r2_vals)) {
          node_labels[node_names == "ScienceCapitalLat"] <-
            sprintf("Science\nCapital\nR² = %.2f", r2_vals[["ScienceCapitalLat"]])
        }
        if ("STEMIdentity" %in% names(r2_vals)) {
          node_labels[node_names == "STEMIdentity"] <-
            sprintf("STEM\nIdentity\nR² = %.2f", r2_vals[["STEMIdentity"]])
        }
        if ("stem_outcome" %in% names(r2_vals)) {
          node_labels[node_names == "stem_outcome"] <-
            sprintf("Core STEM\nundergrad\nR² = %.2f", r2_vals[["stem_outcome"]])
        }
      }

      grDevices::png(file_png, width = 3400, height = 1900, res = 250, bg = "white")
      semPlot::semPaths(
        object = fit,
        what = "std",
        whatLabels = "std",
        style = "lisrel",
        layout = layout_mat,
        reorder = FALSE,
        nodeLabels = node_labels,
        residuals = FALSE,
        intercepts = FALSE,
        thresholds = FALSE,
        exoCov = FALSE,
        fade = FALSE,
        nCharNodes = 0,
        sizeMan = 9,
        sizeLat = 15,
        edge.label.cex = 0.85,
        mar = c(3, 3, 3, 3),
        title = FALSE,
        shapeMan = "rectangle",
        shapeLat = "ellipse",
        color = list(lat = "#2F80ED", man = "#FFF176"),
        edge.color = "black",
        posCol = c("black", "black"),
        negCol = c("black", "black")
      )
      if (!is.null(title_text)) {
        graphics::mtext(title_text, side = 3, line = 0.2, cex = 1.0, font = 2)
      }
      grDevices::dev.off()
    },
    error = function(e) {
      grDevices::png(file_png, width = 2400, height = 1500, res = 220)
      graphics::plot.new()
      graphics::text(0.5, 0.55, "SEM figure could not be rendered", cex = 1.2)
      graphics::text(0.5, 0.45, e$message, cex = 0.85)
      grDevices::dev.off()
    }
  )
}


label_box_dimensions <- function(label, cex = 1, font = 1) {
  lines <- unlist(strsplit(label, "\n", fixed = TRUE))
  width <- max(graphics::strwidth(lines, cex = cex, font = font, units = "user"))
  line_height <- graphics::strheight("Ag", cex = cex, font = font, units = "user")
  height <- max(1, length(lines)) * line_height * 1.15
  list(width = width, height = height)
}

draw_text_with_background <- function(x, y, label, cex = 1, font = 1,
                                      text_col = "black", bg = "white",
                                      pad_x = 0.008, pad_y = 0.006) {
  if (is.null(label) || is.na(label) || !nzchar(label)) {
    return(invisible(NULL))
  }

  dims <- label_box_dimensions(label, cex = cex, font = font)
  graphics::rect(
    xleft = x - dims$width / 2 - pad_x,
    ybottom = y - dims$height / 2 - pad_y,
    xright = x + dims$width / 2 + pad_x,
    ytop = y + dims$height / 2 + pad_y,
    col = bg,
    border = NA,
    xpd = NA
  )
  graphics::text(x, y, labels = label, cex = cex, font = font, col = text_col, xpd = NA)
}

draw_rect_node <- function(cx, cy, width, height, label,
                           fill = "#E8DD87", border = "black",
                           lwd = 1.5, cex = 1, font = 1) {
  graphics::rect(
    xleft = cx - width / 2,
    ybottom = cy - height / 2,
    xright = cx + width / 2,
    ytop = cy + height / 2,
    col = fill,
    border = border,
    lwd = lwd,
    xpd = NA
  )
  graphics::text(cx, cy, labels = label, cex = cex, font = font, xpd = NA)
}

draw_ellipse_node <- function(cx, cy, rx, ry, label,
                              fill = "#DCEAF7", border = "black",
                              lwd = 1.6, cex = 1.35, font = 2) {
  theta <- seq(0, 2 * pi, length.out = 300)
  x <- cx + rx * cos(theta)
  y <- cy + ry * sin(theta)

  graphics::polygon(
    x = x,
    y = y,
    col = fill,
    border = border,
    lwd = lwd,
    xpd = NA
  )
  graphics::text(cx, cy, labels = label, cex = cex, font = font, xpd = NA)
}

extract_standardized_path <- function(fit, lhs, op, rhs) {
  ss <- lavaan::standardizedSolution(fit)
  hit <- ss[ss$lhs == lhs & ss$op == op & ss$rhs == rhs, , drop = FALSE]

  if (nrow(hit) == 0) {
    return(list(std = NA_real_, p = NA_real_))
  }

  list(std = hit$est.std[1], p = hit$pvalue[1])
}

edge_style_from_estimate <- function(std, p = NA_real_, measurement = FALSE) {
  std_abs <- ifelse(is.na(std), 0, abs(std))

  if (measurement) {
    return(list(
      col = "black",
      lty = 1,
      lwd = 1.6 + 2.4 * std_abs
    ))
  }

  is_sig <- !is.na(p) && p < 0.05

  list(
    col = if (is_sig) "black" else "grey55",
    lty = if (is_sig) 1 else 2,
    lwd = 1.3 + 4.0 * std_abs
  )
}

draw_sem_edge <- function(x0, y0, x1, y1, label = NULL,
                          label_x = NULL, label_y = NULL,
                          style = list(col = "black", lty = 1, lwd = 2),
                          label_cex = 1.0) {
  graphics::arrows(
    x0 = x0, y0 = y0,
    x1 = x1, y1 = y1,
    length = 0.08,
    angle = 20,
    code = 2,
    lwd = style$lwd,
    col = style$col,
    lty = style$lty,
    xpd = NA
  )

  if (!is.null(label) && !is.na(label) && nzchar(label)) {
    draw_text_with_background(
      x = if (is.null(label_x)) (x0 + x1) / 2 else label_x,
      y = if (is.null(label_y)) (y0 + y1) / 2 else label_y,
      label = label,
      cex = label_cex
    )
  }
}

fmt_std <- function(x) {
  if (is.null(x) || length(x) == 0 || is.na(x)) {
    return("")
  }
  sprintf("%.2f", x)
}

make_clean_primary_semplot <- function(fit, file_png, title_text = "Controlled primary SEM") {
  tryCatch(
    {
      r2_vals <- tryCatch(lavaan::inspect(fit, "r2"), error = function(e) NULL)

      r2_identity <- if (!is.null(r2_vals) && "STEMIdentity" %in% names(r2_vals)) {
        as.numeric(r2_vals[["STEMIdentity"]])
      } else {
        NA_real_
      }

      r2_outcome <- if (!is.null(r2_vals) && "stem_outcome" %in% names(r2_vals)) {
        as.numeric(r2_vals[["stem_outcome"]])
      } else {
        NA_real_
      }

      latent_cx <- 0.50
      latent_cy <- 0.54
      latent_rx <- 0.13
      latent_ry <- 0.10

      pred_w <- 0.20
      pred_h <- 0.08
      ind_w <- 0.18
      ind_h <- 0.07
      out_w <- 0.18
      out_h <- 0.10

      predictor_nodes <- tibble::tibble(
        var = c("fjob12_z", "scigen_z", "epr5_z", "scicap_comp_z"),
        label = c(
          "Income orientation",
          "Science / technology =\nfuture jobs",
          "Science qualifications =\nmany jobs",
          "Science capital\ncomposite"
        ),
        x = c(0.17, 0.17, 0.17, 0.17),
        y = c(0.80, 0.64, 0.49, 0.34),
        width = pred_w,
        height = pred_h,
        fill = c("white", "white", "white", "white")
      )

      indicator_nodes <- tibble::tibble(
        var = c("sciid1_r", "sciid2_r", "epr13_r"),
        label = c(
          "I am a\nscience person",
          "Others see me as a\nscience person",
          "I am good at\nscience"
        ),
        x = c(0.30, 0.50, 0.70),
        y = c(0.18, 0.18, 0.18),
        width = ind_w,
        height = ind_h,
        fill = c("#F2F2F2", "#F2F2F2", "#F2F2F2")
      )

      outcome_x <- 0.83
      outcome_y <- latent_cy

      identity_path_specs <- tibble::tibble(
        rhs = c("scigen_z", "epr5_z", "scicap_comp_z"),
        x0 = c(0.17 + pred_w / 2, 0.17 + pred_w / 2, 0.17 + pred_w / 2),
        y0 = c(0.64, 0.49, 0.34),
        x1 = c(latent_cx - latent_rx * 0.92, latent_cx - latent_rx * 0.95, latent_cx - latent_rx * 0.90),
        y1 = c(0.62, 0.54, 0.46),
        label_x = c(0.33, 0.33, 0.33),
        label_y = c(0.63, 0.54, 0.41)
      )

      income_to_outcome_spec <- tibble::tibble(
        rhs = "fjob12_z",
        x0 = 0.17 + pred_w / 2,
        y0 = 0.80,
        x1 = outcome_x - out_w / 2,
        y1 = 0.63,
        label_x = 0.61,
        label_y = 0.72
      )

      identity_to_outcome_spec <- tibble::tibble(
        rhs = "STEMIdentity",
        x0 = latent_cx + latent_rx,
        y0 = latent_cy,
        x1 = outcome_x - out_w / 2,
        y1 = outcome_y,
        label_x = 0.67,
        label_y = 0.57
      )

      measurement_specs <- tibble::tibble(
        rhs = c("sciid1_r", "sciid2_r", "epr13_r"),
        x0 = c(0.43, 0.50, 0.57),
        y0 = c(0.45, 0.43, 0.45),
        x1 = c(0.30, 0.50, 0.70),
        y1 = c(0.18 + ind_h / 2, 0.18 + ind_h / 2, 0.18 + ind_h / 2),
        label_x = c(0.38, 0.50, 0.62),
        label_y = c(0.34, 0.31, 0.34)
      )

      grDevices::png(file_png, width = 3400, height = 2200, res = 250, bg = "white")
      old_par <- graphics::par(no.readonly = TRUE)

      graphics::par(mar = c(1.5, 1.5, 1.6, 1.5), xpd = NA)
      graphics::plot.new()
      graphics::plot.window(xlim = c(0, 1), ylim = c(0, 1), xaxs = "i", yaxs = "i")

      # Draw structural boxes
      for (i in seq_len(nrow(predictor_nodes))) {
        draw_rect_node(
          cx = predictor_nodes$x[i],
          cy = predictor_nodes$y[i],
          width = predictor_nodes$width[i],
          height = predictor_nodes$height[i],
          label = predictor_nodes$label[i],
          fill = predictor_nodes$fill[i],
          border = "black",
          lwd = 1.4,
          cex = 0.95,
          font = 1
        )
      }

      draw_rect_node(
        cx = outcome_x,
        cy = outcome_y,
        width = out_w,
        height = out_h,
        label = if (!is.na(r2_outcome)) {
          sprintf("Core STEM\ndegree\nR² = %.2f", r2_outcome)
        } else {
          "Core STEM\ndegree"
        },
        fill = "white",
        border = "black",
        lwd = 1.5,
        cex = 1.10,
        font = 2
      )

      for (i in seq_len(nrow(indicator_nodes))) {
        draw_rect_node(
          cx = indicator_nodes$x[i],
          cy = indicator_nodes$y[i],
          width = indicator_nodes$width[i],
          height = indicator_nodes$height[i],
          label = indicator_nodes$label[i],
          fill = indicator_nodes$fill[i],
          border = "black",
          lwd = 1.2,
          cex = 0.88,
          font = 1
        )
      }

      draw_ellipse_node(
        cx = latent_cx,
        cy = latent_cy,
        rx = latent_rx,
        ry = latent_ry,
        label = if (!is.na(r2_identity)) {
          sprintf("STEM\nidentity\nR² = %.2f", r2_identity)
        } else {
          "STEM\nidentity"
        },
        fill = "#DCEAF7",
        border = "black",
        lwd = 1.6,
        cex = 1.40,
        font = 2
      )

      # Draw focal predictor -> identity paths (significant paths only)
      for (i in seq_len(nrow(identity_path_specs))) {
        est <- extract_standardized_path(fit, "STEMIdentity", "~", identity_path_specs$rhs[i])
        if (!is.na(est$p) && est$p < 0.05) {
          draw_sem_edge(
            x0 = identity_path_specs$x0[i],
            y0 = identity_path_specs$y0[i],
            x1 = identity_path_specs$x1[i],
            y1 = identity_path_specs$y1[i],
            label = fmt_std(est$std),
            label_x = identity_path_specs$label_x[i],
            label_y = identity_path_specs$label_y[i],
            style = list(col = "black", lty = 1, lwd = 1.6 + 2.0 * abs(est$std)),
            label_cex = 0.95
          )
        }
      }

      # Draw income orientation -> outcome path (significant direct path only)
      income_est <- extract_standardized_path(fit, "stem_outcome", "~", income_to_outcome_spec$rhs)
      if (!is.na(income_est$p) && income_est$p < 0.05) {
        draw_sem_edge(
          x0 = income_to_outcome_spec$x0,
          y0 = income_to_outcome_spec$y0,
          x1 = income_to_outcome_spec$x1,
          y1 = income_to_outcome_spec$y1,
          label = fmt_std(income_est$std),
          label_x = income_to_outcome_spec$label_x,
          label_y = income_to_outcome_spec$label_y,
          style = list(col = "black", lty = 1, lwd = 1.6 + 2.0 * abs(income_est$std)),
          label_cex = 0.95
        )
      }

      # Draw latent -> outcome path
      identity_to_outcome_est <- extract_standardized_path(fit, "stem_outcome", "~", identity_to_outcome_spec$rhs)
      draw_sem_edge(
        x0 = identity_to_outcome_spec$x0,
        y0 = identity_to_outcome_spec$y0,
        x1 = identity_to_outcome_spec$x1,
        y1 = identity_to_outcome_spec$y1,
        label = fmt_std(identity_to_outcome_est$std),
        label_x = identity_to_outcome_spec$label_x,
        label_y = identity_to_outcome_spec$label_y,
        style = list(col = "black", lty = 1, lwd = 1.8 + 2.2 * abs(identity_to_outcome_est$std)),
        label_cex = 0.98
      )

      # Draw measurement arrows and loadings
      for (i in seq_len(nrow(measurement_specs))) {
        est <- extract_standardized_path(fit, "STEMIdentity", "=~", measurement_specs$rhs[i])
        draw_sem_edge(
          x0 = measurement_specs$x0[i],
          y0 = measurement_specs$y0[i],
          x1 = measurement_specs$x1[i],
          y1 = measurement_specs$y1[i],
          label = fmt_std(est$std),
          label_x = measurement_specs$label_x[i],
          label_y = measurement_specs$label_y[i],
          style = list(col = "black", lty = 1, lwd = 1.3 + 1.5 * abs(est$std)),
          label_cex = 0.90
        )
      }

      graphics::text(
        x = 0.50, y = 0.965,
        labels = title_text,
        cex = 1.55,
        font = 2,
        xpd = NA
      )

      graphics::text(
        x = 0.50, y = 0.045,
        labels = paste0(
          "Standardised coefficients are shown for focal significant paths only.\n",
          "The model also adjusts for gender, ethnicity, parental higher education, school type, area deprivation,\n",
          "and prior science preparation; control coefficients and non-significant focal paths are omitted for clarity."
        ),
        cex = 0.86,
        xpd = NA
      )

      graphics::par(old_par)
      grDevices::dev.off()
    },
    error = function(e) {
      try(
        {
          if (grDevices::dev.cur() > 1) {
            grDevices::dev.off()
          }
        },
        silent = TRUE
      )

      grDevices::png(file_png, width = 2600, height = 1600, res = 220, bg = "white")
      graphics::plot.new()
      graphics::text(0.5, 0.58, "Hybrid SEM figure could not be rendered", cex = 1.2, font = 2)
      graphics::text(0.5, 0.46, e$message, cex = 0.9)
      grDevices::dev.off()
    }
  )
}
```

# 4. Import data

```{r import-data}
raw_df <- haven::read_dta(data_file) |>
  dplyr::mutate(dplyr::across(dplyr::everything(), recode_user_missing))

if (labels_available) {
  var_labels <- readr::read_csv(labels_file, show_col_types = FALSE) |>
    dplyr::rename(variable = Variable, label = Label)
} else {
  var_labels <- tibble::tibble(variable = character(), label = character())
}

tibble::tibble(
  object = c("raw_df", "var_labels"),
  rows = c(nrow(raw_df), nrow(var_labels)),
  columns = c(ncol(raw_df), ncol(var_labels))
) |>
  knitr::kable(caption = "Imported data objects")
```

# 5. Build population and recode variables

```{r build-analysis-data}
population_df <- build_analysis_sample(
  df = raw_df,
  population = analysis_population,
  stem_def = stem_definition
)

analysis_df <- population_df |>
  dplyr::mutate(
    weight = Weight4 / mean(Weight4, na.rm = TRUE),

    # Science Capital ingredients retained for the benchmark and controlled specifications
    scicap2_r = reverse_ordinal(SCICAP2, 1, 5),
    qtalk_r = recode_qtalk(QTALK),
    qinformed_r = reverse_ordinal(QINFORMEDA, 1, 4, extra_missing = c(5)),

    visit1_r = recode_visit_freq(VISIT1),
    visit2_r = recode_visit_freq(VISIT2),
    visit3_r = recode_visit_freq(VISIT3),
    visit4_r = recode_visit_freq(VISIT4),
    qvisit2_9_r = recode_visit_freq(QVISIT2_9),
    visit6_r = recode_visit_freq(VISIT6),

    science_network_nonmiss = rowSums(!is.na(cbind(WHOSC_SIB, WHOSC_PAR, WHOSC_EXT, WHOSC_FRI, WHOSC_SP, WHOSC_COM, WHOSC_OTH))),
    science_network_count = dplyr::case_when(
      FAM_SCI == 2 ~ 0,
      FAM_SCI == 1 & science_network_nonmiss > 0 ~ rowSums(cbind(WHOSC_SIB, WHOSC_PAR, WHOSC_EXT, WHOSC_FRI, WHOSC_SP, WHOSC_COM, WHOSC_OTH), na.rm = TRUE),
      FAM_SCI == 1 & science_network_nonmiss == 0 ~ NA_real_,
      TRUE ~ NA_real_
    ),

    # Diagnostic / excluded overlap-prone indicators retained for transparency
    fam_sci_r = recode_yes_no(FAM_SCI),
    schsci_enjy_r = reverse_ordinal(SCHSCI_ENJY, 1, 5, extra_missing = 6),
    qagree6_1_r = reverse_ordinal(QAGREE6_1, 1, 5, extra_missing = 6),
    qagree6_2_r = reverse_ordinal(QAGREE6_2, 1, 5, extra_missing = 6),

    # STEM Identity indicators
    sciid1_r = reverse_ordinal(SCIID1, 1, 5),
    sciid2_r = reverse_ordinal(SCIID2, 1, 5),
    epr13_r = reverse_ordinal(EPR13, 1, 5),

    # Economic variables
    epr5_r = reverse_ordinal(EPR5, 1, 5),
    scigen_r = reverse_ordinal(SCI_GEN, 1, 5, extra_missing = c(6)),
    fjob12_r = reverse_ordinal(FJOB12, 1, 4),
    view2_r = reverse_ordinal(VIEW2, 1, 5),
    app_res4_r = reverse_ordinal(APP_RES4, 1, 5),

    # Background controls
    gender_woman = dplyr::case_when(
      GENDER == 1 ~ 1,
      GENDER %in% c(2, 3, 4, 5) ~ 0,
      TRUE ~ NA_real_
    ),
    gender_diverse = dplyr::case_when(
      GENDER %in% c(3, 4, 5) ~ 1,
      GENDER %in% c(1, 2) ~ 0,
      TRUE ~ NA_real_
    ),
    eth_minoritised = dplyr::case_when(
      ETH == 3 ~ 0,
      ETH %in% c(1, 2, 4, 5, 6) ~ 1,
      TRUE ~ NA_real_
    ),
    parent_uni_any = dplyr::case_when(
      PAR1UNI == 1 | PAR2UNI == 1 ~ 1,
      PAR1UNI %in% c(2, 3) | PAR2UNI %in% c(2, 3) ~ 0,
      TRUE ~ NA_real_
    ),
    school_selective = dplyr::case_when(
      SC_TYPE == 2 ~ 1,
      SC_TYPE %in% c(1, 3) ~ 0,
      TRUE ~ NA_real_
    ),
    school_private = dplyr::case_when(
      SC_TYPE == 3 ~ 1,
      SC_TYPE %in% c(1, 2) ~ 0,
      TRUE ~ NA_real_
    ),
    high_sci_gcse = dplyr::case_when(
      SCI_ATT_GCSE == 1 ~ 1,
      SCI_ATT_GCSE %in% c(2, 3, 4, 5, 6, 7) ~ 0,
      TRUE ~ NA_real_
    ),
    high_math_gcse = dplyr::case_when(
      MATH_ATT_GCSE == 1 ~ 1,
      MATH_ATT_GCSE %in% c(2, 3, 4, 5, 6, 7) ~ 0,
      TRUE ~ NA_real_
    ),
    triple_science = dplyr::case_when(
      SCI_GCSE == 2 ~ 1,
      SCI_GCSE %in% c(1, 3, 4, 5) ~ 0,
      TRUE ~ NA_real_
    ),
    top_science_set = dplyr::case_when(
      SCISET == 1 ~ 1,
      SCISET %in% c(2, 3, 4) ~ 0,
      TRUE ~ NA_real_
    )
  ) |>
  dplyr::mutate(
    science_visit_index = row_mean_min_n(
      data.frame(visit1_r, visit2_r, visit3_r, visit4_r, qvisit2_9_r, visit6_r),
      min_nonmiss = 3
    ),
    post16_stem_any = row_any_yes(
      data.frame(
        SUB_AL_4, SUB_AL_6, SUB_AL_9, SUB_AL_10, SUB_AL_19, SUB_AL_25, SUB_AL_30, SUB_AL_32,
        SUB_BTEC_2, SUB_BTEC_4, SUB_BTEC_6, SUB_BTEC_9, SUB_BTEC_10, SUB_BTEC_25, SUB_BTEC_30
      ),
      min_nonmiss = 1
    ),

    scicap2_z = safe_scale(scicap2_r),
    qtalk_z = safe_scale(qtalk_r),
    qinformed_z = safe_scale(qinformed_r),
    science_network_count_z = safe_scale(science_network_count),
    science_visit_index_z = safe_scale(science_visit_index),

    scicap_comp_z = row_mean_min_n(
      data.frame(scicap2_z, qtalk_z, qinformed_z, science_network_count_z, science_visit_index_z),
      min_nonmiss = 3
    ),

    stem_identity_comp = row_mean_min_n(
      data.frame(sciid1_r, sciid2_r, epr13_r),
      min_nonmiss = 2
    ),

    epr5_z = safe_scale(epr5_r),
    scigen_z = safe_scale(scigen_r),
    fjob12_z = safe_scale(fjob12_r),
    view2_z = safe_scale(view2_r),

    high_sci_gcse_z = safe_scale(high_sci_gcse),
    high_math_gcse_z = safe_scale(high_math_gcse),
    triple_science_z = safe_scale(triple_science),
    top_science_set_z = safe_scale(top_science_set),
    post16_stem_any_z = safe_scale(post16_stem_any),
    prior_science_prep_z = row_mean_min_n(
      data.frame(high_sci_gcse_z, high_math_gcse_z, triple_science_z, top_science_set_z, post16_stem_any_z),
      min_nonmiss = 3
    ),
    imd_quintile_z = safe_scale(IMDquintile)
  )

control_vars <- c(
  "gender_woman", "gender_diverse", "eth_minoritised", "parent_uni_any",
  "school_selective", "school_private", "imd_quintile_z", "prior_science_prep_z"
)

analysis_var_labels <- c(
  scicap2_r = "Scientific evidence argument (reversed)",
  qtalk_r = "Talks about science (higher = more frequent)",
  qinformed_r = "Feels informed about science (reversed)",
  science_network_count = "Breadth of known STEM contacts",
  science_visit_index = "Science exposure visit index",
  scicap_comp_z = "Composite Science Capital index (z-based average)",
  sciid1_r = "I see myself as a science person (reversed)",
  sciid2_r = "Other people see me as a science person (reversed)",
  epr13_r = "I am good at science (reversed)",
  stem_identity_comp = "Composite STEM identity score",
  epr5_r = "Science qualification -> many jobs (reversed)",
  epr5_z = "Science qualification -> many jobs (standardised)",
  scigen_r = "Science/technology -> more jobs (reversed)",
  scigen_z = "Science/technology -> more jobs (standardised)",
  fjob12_r = "To earn a lot of money matters (reversed)",
  fjob12_z = "To earn a lot of money matters (standardised)",
  view2_r = "Scientists/people using science earn a lot of money (reversed)",
  view2_z = "Scientists/people using science earn a lot of money (standardised)",
  gender_woman = "Woman (ref. man)",
  gender_diverse = "Gender diverse / prefer not to say",
  eth_minoritised = "Minoritised ethnicity",
  parent_uni_any = "At least one parent attended university",
  school_selective = "Selective school",
  school_private = "Private school",
  high_sci_gcse = "High science GCSE attainment",
  high_math_gcse = "High maths GCSE attainment",
  triple_science = "Triple science at GCSE",
  top_science_set = "Top set for science in Year 11",
  post16_stem_any = "Any post-16 STEM subject",
  prior_science_prep_z = "Prior science preparation composite (z-based average)",
  imd_quintile_z = "IMD quintile (standardised)",
  stem_outcome = "Core STEM undergrad vs non-STEM undergrad"
)

tibble::tibble(
  stage = c("Raw sample", "Filtered analytic population"),
  n = c(nrow(raw_df), nrow(analysis_df))
) |>
  knitr::kable(caption = "Sample size after population and outcome filtering")
```

# 6. Diagnostics: attrition, missingness, and sample comparison

```{r diagnostics}
benchmark_sem_vars <- c(
  "scicap_comp_z",
  "sciid1_r", "sciid2_r", "epr13_r",
  "epr5_z", "scigen_z", "fjob12_z",
  "stem_outcome", "weight"
)

controlled_sem_vars <- c(
  benchmark_sem_vars,
  control_vars
)

latent_sem_vars <- c(
  "scicap2_r", "qtalk_r", "qinformed_r", "science_network_count_z", "science_visit_index_z",
  "sciid1_r", "sciid2_r", "epr13_r",
  "epr5_z", "scigen_z", "fjob12_z",
  control_vars,
  "stem_outcome", "weight"
)

logit_vars <- c(
  "scicap_comp_z", "stem_identity_comp",
  "epr5_z", "scigen_z", "fjob12_z", "view2_z",
  control_vars,
  "stem_outcome", "weight"
)

benchmark_sem_df <- analysis_df |>
  dplyr::select(dplyr::all_of(benchmark_sem_vars))

controlled_sem_df <- analysis_df |>
  dplyr::select(dplyr::all_of(controlled_sem_vars))

latent_sem_df <- analysis_df |>
  dplyr::select(dplyr::all_of(latent_sem_vars))

benchmark_sem_cc <- benchmark_sem_df |>
  tidyr::drop_na()

controlled_sem_cc <- controlled_sem_df |>
  tidyr::drop_na()

latent_sem_cc <- latent_sem_df |>
  tidyr::drop_na()

logit_df <- analysis_df |>
  dplyr::select(dplyr::all_of(logit_vars)) |>
  tidyr::drop_na()

attrition_tbl <- tibble::tribble(
  ~stage, ~n,
  "Raw ASPIRES 3 sample", nrow(raw_df),
  "After population filter and outcome definition", nrow(analysis_df),
  "Benchmark SEM available cases (pairwise WLSMV)", nrow(benchmark_sem_df),
  "Controlled SEM available cases (pairwise WLSMV)", nrow(controlled_sem_df),
  "Benchmark SEM complete-case equivalent", nrow(benchmark_sem_cc),
  "Controlled SEM complete-case equivalent", nrow(controlled_sem_cc),
  "Controlled latent sensitivity complete-case equivalent", nrow(latent_sem_cc),
  "Controlled logit common sample", nrow(logit_df)
)

missingness_tbl <- analysis_df |>
  dplyr::select(
    dplyr::all_of(c(
      "scicap2_r", "qtalk_r", "qinformed_r", "science_network_count", "science_visit_index",
      "scicap_comp_z", "sciid1_r", "sciid2_r", "epr13_r", "stem_identity_comp",
      "epr5_r", "scigen_r", "fjob12_r", "view2_r",
      control_vars,
      "high_sci_gcse", "high_math_gcse", "triple_science", "top_science_set", "post16_stem_any",
      "prior_science_prep_z", "stem_outcome", "weight"
    ))
  ) |>
  dplyr::summarise(
    dplyr::across(
      dplyr::everything(),
      ~ mean(is.na(.x))
    )
  ) |>
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "variable",
    values_to = "prop_missing"
  ) |>
  dplyr::mutate(label = dplyr::if_else(variable %in% names(analysis_var_labels), unname(analysis_var_labels[variable]), variable)) |>
  dplyr::arrange(dplyr::desc(prop_missing))

control_design_note <- tibble::tribble(
  ~decision_point, ~rationale,
  "Add background controls to both STEM Identity and STEM outcome", "Reduces omitted-variable bias and makes the SEM less purely descriptive.",
  "Use a prior science preparation composite", "Summarises science track, attainment, and post-16 STEM exposure without forcing a weak reflective latent block.",
  "Use WLSMV with pairwise missing data plus fixed.x = FALSE and conditional.x = FALSE", "Models exogenous predictors jointly within the categorical SEM and avoids unnecessary exogenous-variable listwise deletion.",
  "Do not include university type as a primary control", "University type is contemporaneous with degree field and is potentially downstream of the subject-choice process in this same-wave design."
)

analysis_df$in_logit_sample <- as.integer(stats::complete.cases(analysis_df[, logit_vars]))

sample_compare_tbl <- make_sample_compare(
  data = analysis_df,
  include_var = "in_logit_sample",
  vars = c(
    "stem_outcome", "scicap_comp_z", "stem_identity_comp", "epr5_z", "scigen_z", "fjob12_z",
    control_vars
  ),
  weight_var = "weight"
) |>
  dplyr::mutate(label = dplyr::if_else(variable %in% names(analysis_var_labels), unname(analysis_var_labels[variable]), variable)) |>
  dplyr::select(variable, label, overall, included, excluded, abs_diff, prop_missing)

write_csv_safely(attrition_tbl, file.path(out_dir, "01_attrition.csv"))
write_csv_safely(missingness_tbl, file.path(out_dir, "02_missingness.csv"))
write_csv_safely(control_design_note, file.path(out_dir, "03_control_design_note.csv"))
write_csv_safely(sample_compare_tbl, file.path(out_dir, "04_logit_sample_comparison.csv"))

attrition_tbl |>
  knitr::kable(caption = "Attrition and analysis-sample summary")
```

```{r missingness-top}
head(missingness_tbl, 18) |>
  knitr::kable(digits = 3, caption = "Top missingness proportions for revised variables")
```

```{r sample-comparison}
sample_compare_tbl |>
  knitr::kable(digits = 3, caption = "Weighted comparison of the full analytic population and the controlled-logit common sample")
```

# 7. Descriptive statistics and composite diagnostics

```{r descriptives}
descriptive_vars <- c(
  "scicap2_r", "qtalk_r", "qinformed_r", "science_network_count", "science_visit_index", "scicap_comp_z",
  "sciid1_r", "sciid2_r", "epr13_r", "stem_identity_comp",
  "epr5_r", "scigen_r", "fjob12_r", "view2_r",
  "gender_woman", "gender_diverse", "eth_minoritised", "parent_uni_any",
  "school_selective", "school_private", "imd_quintile_z", "prior_science_prep_z",
  "stem_outcome"
)

descriptive_tbl <- purrr::map_dfr(
  descriptive_vars,
  function(v) {
    x <- analysis_df[[v]]
    tibble::tibble(
      variable = v,
      label = if (v %in% names(analysis_var_labels)) unname(analysis_var_labels[v]) else v,
      n = sum(!is.na(x)),
      mean = mean(x, na.rm = TRUE),
      sd = stats::sd(x, na.rm = TRUE),
      min = min(x, na.rm = TRUE),
      max = max(x, na.rm = TRUE)
    )
  }
)

science_component_corr_tbl <- analysis_df |>
  dplyr::select(scicap2_r, qtalk_r, qinformed_r, science_network_count, science_visit_index, scicap_comp_z, stem_identity_comp) |>
  stats::cor(use = "pairwise.complete.obs") |>
  round(3) |>
  as.data.frame() |>
  tibble::rownames_to_column("variable")

prior_prep_corr_tbl <- analysis_df |>
  dplyr::select(high_sci_gcse, high_math_gcse, triple_science, top_science_set, post16_stem_any, prior_science_prep_z) |>
  stats::cor(use = "pairwise.complete.obs") |>
  round(3) |>
  as.data.frame() |>
  tibble::rownames_to_column("variable")

write_csv_safely(descriptive_tbl, file.path(out_dir, "05_descriptives.csv"))
write_csv_safely(science_component_corr_tbl, file.path(out_dir, "06_science_component_correlations.csv"))
write_csv_safely(prior_prep_corr_tbl, file.path(out_dir, "07_prior_prep_correlations.csv"))

descriptive_tbl |>
  knitr::kable(digits = 3, caption = "Descriptive statistics for revised variables")
```

```{r component-correlations}
science_component_corr_tbl |>
  knitr::kable(caption = "Correlations among Science Capital ingredients and STEM identity")
```

```{r prior-prep-correlations}
prior_prep_corr_tbl |>
  knitr::kable(caption = "Correlations among prior science preparation indicators")
```

# 8. Assumption screening

```{r vif}
vif_screen_df <- analysis_df |>
  dplyr::select(
    stem_identity_comp,
    scicap_comp_z, epr5_z, scigen_z, fjob12_z,
    dplyr::all_of(control_vars)
  ) |>
  tidyr::drop_na()

vif_model <- stats::lm(
  stem_identity_comp ~ scicap_comp_z + epr5_z + scigen_z + fjob12_z +
    gender_woman + gender_diverse + eth_minoritised + parent_uni_any +
    school_selective + school_private + imd_quintile_z + prior_science_prep_z,
  data = vif_screen_df
)

vif_tbl <- tibble::tibble(
  term = names(car::vif(vif_model)),
  vif = as.numeric(car::vif(vif_model))
)

write_csv_safely(vif_tbl, file.path(out_dir, "08_vif_screening.csv"))

vif_tbl |>
  knitr::kable(digits = 3, caption = "Variance inflation factors for the controlled predictor set")
```

# 9. Measurement checks

## 9.1 STEM Identity CFA

```{r identity-cfa}
identity_model <- '
  STEMIdentity =~ sciid1_r + sciid2_r + epr13_r
'

identity_items <- c("sciid1_r", "sciid2_r", "epr13_r")

identity_args <- list(
  model = identity_model,
  data = benchmark_sem_df,
  ordered = identity_items,
  estimator = sem_estimator,
  missing = sem_missing,
  std.lv = TRUE
)

if (use_sampling_weights) {
  identity_args$sampling.weights <- weight_var
}

fit_identity_cfa <- do.call(lavaan::cfa, identity_args)

identity_fit_tbl <- extract_fit_table(fit_identity_cfa)
identity_loading_tbl <- extract_std_loadings(fit_identity_cfa)
identity_rel_tbl <- safe_reliability_tbl(fit_identity_cfa)

write_csv_safely(identity_fit_tbl, file.path(out_dir, "09_identity_cfa_fit.csv"))
write_csv_safely(identity_loading_tbl, file.path(out_dir, "10_identity_cfa_loadings.csv"))
write_csv_safely(identity_rel_tbl, file.path(out_dir, "11_identity_cfa_reliability.csv"))
```

```{r identity-cfa-fit}
identity_fit_tbl |>
  knitr::kable(digits = 3, caption = "STEM Identity CFA fit indices")
```

```{r identity-cfa-loadings}
identity_loading_tbl |>
  knitr::kable(digits = 3, caption = "STEM Identity CFA loadings")
```

## 9.2 Enriched two-factor CFA (measurement sensitivity)

```{r twofactor-cfa}
twofactor_model <- '
  ScienceCapitalLat =~ scicap2_r + qtalk_r + qinformed_r + science_network_count_z + science_visit_index_z
  STEMIdentity =~ sciid1_r + sciid2_r + epr13_r
'

twofactor_data_vars <- c(
  "scicap2_r", "qtalk_r", "qinformed_r", "science_network_count_z", "science_visit_index_z",
  "sciid1_r", "sciid2_r", "epr13_r"
)

twofactor_ordered <- c("scicap2_r", "qtalk_r", "qinformed_r", "sciid1_r", "sciid2_r", "epr13_r")

twofactor_args <- list(
  model = twofactor_model,
  data = latent_sem_df,
  ordered = twofactor_ordered,
  estimator = sem_estimator,
  missing = sem_missing,
  std.lv = TRUE
)

if (use_sampling_weights) {
  twofactor_args$sampling.weights <- weight_var
}

fit_twofactor_cfa <- do.call(lavaan::cfa, twofactor_args)

twofactor_fit_tbl <- extract_fit_table(fit_twofactor_cfa)
twofactor_loading_tbl <- extract_std_loadings(fit_twofactor_cfa)
twofactor_rel_tbl <- safe_reliability_tbl(fit_twofactor_cfa)

twofactor_htmt_tbl <- safe_htmt_tbl(
  model_syntax = twofactor_model,
  data = latent_sem_df |>
    dplyr::select(dplyr::all_of(twofactor_data_vars)),
  ordered = twofactor_ordered
)

twofactor_weak_loading_tbl <- twofactor_loading_tbl |>
  dplyr::filter(std_loading < 0.40)

write_csv_safely(twofactor_fit_tbl, file.path(out_dir, "12_twofactor_cfa_fit.csv"))
write_csv_safely(twofactor_loading_tbl, file.path(out_dir, "13_twofactor_cfa_loadings.csv"))
write_csv_safely(twofactor_rel_tbl, file.path(out_dir, "14_twofactor_cfa_reliability.csv"))
write_csv_safely(twofactor_htmt_tbl, file.path(out_dir, "15_twofactor_cfa_htmt.csv"))
write_csv_safely(twofactor_weak_loading_tbl, file.path(out_dir, "16_twofactor_cfa_weak_loadings.csv"))
```

```{r twofactor-cfa-fit}
twofactor_fit_tbl |>
  knitr::kable(digits = 3, caption = "Enriched two-factor CFA fit indices")
```

```{r twofactor-cfa-loadings}
twofactor_loading_tbl |>
  knitr::kable(digits = 3, caption = "Enriched two-factor CFA loadings")
```

# 10. Benchmark SEM: theory-led model (pairwise WLSMV)

```{r benchmark-sem}
benchmark_sem_model <- '
  STEMIdentity =~ sciid1_r + sciid2_r + epr13_r

  STEMIdentity ~ a1*scicap_comp_z + a2*epr5_z + a3*scigen_z + a4*fjob12_z
  stem_outcome ~ b1*STEMIdentity + c1*scicap_comp_z + c2*epr5_z + c3*scigen_z + c4*fjob12_z

  ind_scicap := a1*b1
  ind_epr5 := a2*b1
  ind_scigen := a3*b1
  ind_fjob12 := a4*b1

  total_scicap := c1 + ind_scicap
  total_epr5 := c2 + ind_epr5
  total_scigen := c3 + ind_scigen
  total_fjob12 := c4 + ind_fjob12
'

benchmark_sem_args <- list(
  model = benchmark_sem_model,
  data = benchmark_sem_df,
  ordered = c("sciid1_r", "sciid2_r", "epr13_r", "stem_outcome"),
  estimator = sem_estimator,
  missing = sem_missing,
  fixed.x = FALSE,
  conditional.x = FALSE,
  std.lv = TRUE
)

if (use_sampling_weights) {
  benchmark_sem_args$sampling.weights <- weight_var
}

fit_benchmark_sem <- do.call(lavaan::sem, benchmark_sem_args)

benchmark_fit_tbl <- extract_fit_table(fit_benchmark_sem)
benchmark_paths_tbl <- extract_std_paths(fit_benchmark_sem)
benchmark_rsq_tbl <- tibble::enframe(lavaan::inspect(fit_benchmark_sem, "r2"), name = "variable", value = "r2")
benchmark_mod_tbl <- tryCatch(
  {
    lavaan::modindices(fit_benchmark_sem, sort. = TRUE) |>
      tibble::as_tibble() |>
      dplyr::filter(mi >= 10) |>
      dplyr::slice_head(n = 20)
  },
  error = function(e) {
    tibble::tibble(message = e$message)
  }
)

write_csv_safely(benchmark_fit_tbl, file.path(out_dir, "17_benchmark_sem_fit.csv"))
write_csv_safely(benchmark_paths_tbl, file.path(out_dir, "18_benchmark_sem_paths.csv"))
write_csv_safely(benchmark_rsq_tbl, file.path(out_dir, "19_benchmark_sem_r2.csv"))
write_csv_safely(benchmark_mod_tbl, file.path(out_dir, "20_benchmark_sem_modindices_top20.csv"))

make_smartpls_semplot(
  fit = fit_benchmark_sem,
  file_png = file.path(out_dir, "21_benchmark_sem_smartpls_style.png"),
  title_text = "Benchmark theory-led SEM"
)
```

```{r benchmark-sem-fit}
benchmark_fit_tbl |>
  knitr::kable(digits = 3, caption = "Benchmark theory-led SEM fit indices")
```

```{r benchmark-sem-r2}
benchmark_rsq_tbl |>
  knitr::kable(digits = 3, caption = "Benchmark SEM R² values")
```

```{r benchmark-sem-paths}
benchmark_paths_tbl |>
  knitr::kable(digits = 3, caption = "Benchmark SEM standardised paths and effects")
```

```{r benchmark-sem-figure, out.width='100%'}
knitr::include_graphics(file.path(out_dir, "21_benchmark_sem_smartpls_style.png"))
```

# 11. Controlled primary SEM

```{r controlled-primary-sem}
controlled_primary_sem_model <- '
  STEMIdentity =~ sciid1_r + sciid2_r + epr13_r

  STEMIdentity ~ a1*scicap_comp_z + a2*epr5_z + a3*scigen_z + a4*fjob12_z +
                 d1*gender_woman + d2*gender_diverse + d3*eth_minoritised + d4*parent_uni_any +
                 d5*school_selective + d6*school_private + d7*imd_quintile_z + d8*prior_science_prep_z

  stem_outcome ~ b1*STEMIdentity + c1*scicap_comp_z + c2*epr5_z + c3*scigen_z + c4*fjob12_z +
                 e1*gender_woman + e2*gender_diverse + e3*eth_minoritised + e4*parent_uni_any +
                 e5*school_selective + e6*school_private + e7*imd_quintile_z + e8*prior_science_prep_z

  ind_scicap := a1*b1
  ind_epr5 := a2*b1
  ind_scigen := a3*b1
  ind_fjob12 := a4*b1

  total_scicap := c1 + ind_scicap
  total_epr5 := c2 + ind_epr5
  total_scigen := c3 + ind_scigen
  total_fjob12 := c4 + ind_fjob12
'

controlled_primary_args <- list(
  model = controlled_primary_sem_model,
  data = controlled_sem_df,
  ordered = c("sciid1_r", "sciid2_r", "epr13_r", "stem_outcome"),
  estimator = sem_estimator,
  missing = sem_missing,
  fixed.x = FALSE,
  conditional.x = FALSE,
  std.lv = TRUE
)

if (use_sampling_weights) {
  controlled_primary_args$sampling.weights <- weight_var
}

fit_controlled_primary <- do.call(lavaan::sem, controlled_primary_args)

controlled_primary_fit_tbl <- extract_fit_table(fit_controlled_primary)
controlled_primary_paths_tbl <- extract_std_paths(fit_controlled_primary)
controlled_primary_rsq_tbl <- tibble::enframe(lavaan::inspect(fit_controlled_primary, "r2"), name = "variable", value = "r2")
controlled_primary_mod_tbl <- tryCatch(
  {
    lavaan::modindices(fit_controlled_primary, sort. = TRUE) |>
      tibble::as_tibble() |>
      dplyr::filter(mi >= 10) |>
      dplyr::slice_head(n = 20)
  },
  error = function(e) {
    tibble::tibble(message = e$message)
  }
)

write_csv_safely(controlled_primary_fit_tbl, file.path(out_dir, "22_controlled_primary_sem_fit.csv"))
write_csv_safely(controlled_primary_paths_tbl, file.path(out_dir, "23_controlled_primary_sem_paths.csv"))
write_csv_safely(controlled_primary_rsq_tbl, file.path(out_dir, "24_controlled_primary_sem_r2.csv"))
write_csv_safely(controlled_primary_mod_tbl, file.path(out_dir, "25_controlled_primary_sem_modindices_top20.csv"))

make_clean_primary_semplot(
  fit = fit_controlled_primary,
  file_png = file.path(out_dir, "26_controlled_primary_sem_clean.png"),
  title_text = "Controlled primary SEM"
)
```

```{r controlled-primary-sem-fit}
controlled_primary_fit_tbl |>
  knitr::kable(digits = 3, caption = "Controlled primary SEM fit indices")
```

```{r controlled-primary-sem-r2}
controlled_primary_rsq_tbl |>
  knitr::kable(digits = 3, caption = "Controlled primary SEM R² values")
```

```{r controlled-primary-sem-paths}
controlled_primary_paths_tbl |>
  knitr::kable(digits = 3, caption = "Controlled primary SEM standardised paths and effects")
```

```{r controlled-primary-sem-figure, out.width='100%'}
knitr::include_graphics(file.path(out_dir, "26_controlled_primary_sem_clean.png"))
```

# 12. Controlled comparator SEM

```{r controlled-comparator-sem}
controlled_comparator_sem_model <- '
  STEMIdentity =~ sciid1_r + sciid2_r + epr13_r

  STEMIdentity ~ d1*gender_woman + d2*gender_diverse + d3*eth_minoritised + d4*parent_uni_any +
                 d5*school_selective + d6*school_private + d7*imd_quintile_z + d8*prior_science_prep_z

  stem_outcome ~ b1*STEMIdentity + c1*scicap_comp_z + c2*epr5_z + c3*scigen_z + c4*fjob12_z +
                 e1*gender_woman + e2*gender_diverse + e3*eth_minoritised + e4*parent_uni_any +
                 e5*school_selective + e6*school_private + e7*imd_quintile_z + e8*prior_science_prep_z

  STEMIdentity ~~ scicap_comp_z + epr5_z + scigen_z + fjob12_z
'

controlled_comparator_args <- list(
  model = controlled_comparator_sem_model,
  data = controlled_sem_df,
  ordered = c("sciid1_r", "sciid2_r", "epr13_r", "stem_outcome"),
  estimator = sem_estimator,
  missing = sem_missing,
  fixed.x = FALSE,
  conditional.x = FALSE,
  std.lv = TRUE
)

if (use_sampling_weights) {
  controlled_comparator_args$sampling.weights <- weight_var
}

fit_controlled_comparator <- do.call(lavaan::sem, controlled_comparator_args)

controlled_comparator_fit_tbl <- extract_fit_table(fit_controlled_comparator)
controlled_comparator_paths_tbl <- extract_std_paths(fit_controlled_comparator, include_cov = TRUE)
controlled_comparator_rsq_tbl <- tibble::enframe(lavaan::inspect(fit_controlled_comparator, "r2"), name = "variable", value = "r2")

write_csv_safely(controlled_comparator_fit_tbl, file.path(out_dir, "27_controlled_comparator_sem_fit.csv"))
write_csv_safely(controlled_comparator_paths_tbl, file.path(out_dir, "28_controlled_comparator_sem_paths.csv"))
write_csv_safely(controlled_comparator_rsq_tbl, file.path(out_dir, "29_controlled_comparator_sem_r2.csv"))

make_smartpls_semplot(
  fit = fit_controlled_comparator,
  file_png = file.path(out_dir, "30_controlled_comparator_sem_smartpls_style.png"),
  title_text = "Controlled comparator SEM"
)
```

```{r controlled-comparator-sem-fit}
controlled_comparator_fit_tbl |>
  knitr::kable(digits = 3, caption = "Controlled comparator SEM fit indices")
```

```{r controlled-comparator-sem-r2}
controlled_comparator_rsq_tbl |>
  knitr::kable(digits = 3, caption = "Controlled comparator SEM R² values")
```

```{r controlled-comparator-sem-paths}
controlled_comparator_paths_tbl |>
  knitr::kable(digits = 3, caption = "Controlled comparator SEM paths and selected covariances")
```

```{r controlled-comparator-sem-figure, out.width='100%'}
knitr::include_graphics(file.path(out_dir, "30_controlled_comparator_sem_smartpls_style.png"))
```

# 13. Controlled latent sensitivity SEM

```{r controlled-latent-sensitivity-sem}
controlled_latent_sem_model <- '
  ScienceCapitalLat =~ scicap2_r + qtalk_r + qinformed_r + science_network_count_z + science_visit_index_z
  STEMIdentity =~ sciid1_r + sciid2_r + epr13_r

  STEMIdentity ~ a1*ScienceCapitalLat + a2*epr5_z + a3*scigen_z + a4*fjob12_z +
                 d1*gender_woman + d2*gender_diverse + d3*eth_minoritised + d4*parent_uni_any +
                 d5*school_selective + d6*school_private + d7*imd_quintile_z + d8*prior_science_prep_z

  stem_outcome ~ b1*STEMIdentity + c1*ScienceCapitalLat + c2*epr5_z + c3*scigen_z + c4*fjob12_z +
                 e1*gender_woman + e2*gender_diverse + e3*eth_minoritised + e4*parent_uni_any +
                 e5*school_selective + e6*school_private + e7*imd_quintile_z + e8*prior_science_prep_z

  ind_scicap := a1*b1
  ind_epr5 := a2*b1
  ind_scigen := a3*b1
  ind_fjob12 := a4*b1

  total_scicap := c1 + ind_scicap
  total_epr5 := c2 + ind_epr5
  total_scigen := c3 + ind_scigen
  total_fjob12 := c4 + ind_fjob12
'

controlled_latent_args <- list(
  model = controlled_latent_sem_model,
  data = latent_sem_df,
  ordered = c("scicap2_r", "qtalk_r", "qinformed_r", "sciid1_r", "sciid2_r", "epr13_r", "stem_outcome"),
  estimator = sem_estimator,
  missing = sem_missing,
  fixed.x = FALSE,
  conditional.x = FALSE,
  std.lv = TRUE
)

if (use_sampling_weights) {
  controlled_latent_args$sampling.weights <- weight_var
}

fit_controlled_latent <- do.call(lavaan::sem, controlled_latent_args)

controlled_latent_fit_tbl <- extract_fit_table(fit_controlled_latent)
controlled_latent_loadings_tbl <- extract_std_loadings(fit_controlled_latent)
controlled_latent_paths_tbl <- extract_std_paths(fit_controlled_latent)
controlled_latent_rsq_tbl <- tibble::enframe(lavaan::inspect(fit_controlled_latent, "r2"), name = "variable", value = "r2")

write_csv_safely(controlled_latent_fit_tbl, file.path(out_dir, "31_controlled_latent_sem_fit.csv"))
write_csv_safely(controlled_latent_loadings_tbl, file.path(out_dir, "32_controlled_latent_sem_loadings.csv"))
write_csv_safely(controlled_latent_paths_tbl, file.path(out_dir, "33_controlled_latent_sem_paths.csv"))
write_csv_safely(controlled_latent_rsq_tbl, file.path(out_dir, "34_controlled_latent_sem_r2.csv"))

make_smartpls_semplot(
  fit = fit_controlled_latent,
  file_png = file.path(out_dir, "35_controlled_latent_sem_smartpls_style.png"),
  title_text = "Controlled latent sensitivity SEM"
)
```

```{r controlled-latent-fit}
controlled_latent_fit_tbl |>
  knitr::kable(digits = 3, caption = "Controlled latent sensitivity SEM fit indices")
```

```{r controlled-latent-r2}
controlled_latent_rsq_tbl |>
  knitr::kable(digits = 3, caption = "Controlled latent sensitivity SEM R² values")
```

```{r controlled-latent-paths}
controlled_latent_paths_tbl |>
  knitr::kable(digits = 3, caption = "Controlled latent sensitivity SEM standardised paths and effects")
```

```{r controlled-latent-figure, out.width='100%'}
knitr::include_graphics(file.path(out_dir, "35_controlled_latent_sem_smartpls_style.png"))
```

# 14. SEM model comparison summary

```{r sem-model-comparison}
sem_model_comp_tbl <- dplyr::bind_rows(
  extract_fit_summary(fit_benchmark_sem, "Benchmark theory-led SEM"),
  extract_fit_summary(fit_controlled_primary, "Controlled primary SEM"),
  extract_fit_summary(fit_controlled_comparator, "Controlled comparator SEM"),
  extract_fit_summary(fit_controlled_latent, "Controlled latent sensitivity SEM")
)

sem_rsq_comp_tbl <- dplyr::bind_rows(
  benchmark_rsq_tbl |>
    dplyr::mutate(model = "Benchmark theory-led SEM"),
  controlled_primary_rsq_tbl |>
    dplyr::mutate(model = "Controlled primary SEM"),
  controlled_comparator_rsq_tbl |>
    dplyr::mutate(model = "Controlled comparator SEM"),
  controlled_latent_rsq_tbl |>
    dplyr::mutate(model = "Controlled latent sensitivity SEM")
) |>
  dplyr::select(model, variable, r2)

write_csv_safely(sem_model_comp_tbl, file.path(out_dir, "36_sem_model_comparison.csv"))
write_csv_safely(sem_rsq_comp_tbl, file.path(out_dir, "37_sem_r2_comparison.csv"))

sem_model_comp_tbl |>
  knitr::kable(digits = 3, caption = "Comparison of benchmark and controlled SEM specifications")
```

# 15. Weighted logistic robustness checks

```{r logistic-robustness}
model_theory <- fit_logit(
  stem_outcome ~ scicap_comp_z + stem_identity_comp,
  data = logit_df,
  use_weights = use_sampling_weights,
  weight_var = weight_var
)

model_theory_econ <- fit_logit(
  stem_outcome ~ scicap_comp_z + stem_identity_comp + epr5_z + scigen_z + fjob12_z,
  data = logit_df,
  use_weights = use_sampling_weights,
  weight_var = weight_var
)

model_controlled_base <- fit_logit(
  stem_outcome ~ scicap_comp_z + stem_identity_comp +
    gender_woman + gender_diverse + eth_minoritised + parent_uni_any +
    school_selective + school_private + imd_quintile_z + prior_science_prep_z,
  data = logit_df,
  use_weights = use_sampling_weights,
  weight_var = weight_var
)

model_controlled_econ <- fit_logit(
  stem_outcome ~ scicap_comp_z + stem_identity_comp + epr5_z + scigen_z + fjob12_z +
    gender_woman + gender_diverse + eth_minoritised + parent_uni_any +
    school_selective + school_private + imd_quintile_z + prior_science_prep_z,
  data = logit_df,
  use_weights = use_sampling_weights,
  weight_var = weight_var
)

model_controlled_view2 <- fit_logit(
  stem_outcome ~ scicap_comp_z + stem_identity_comp + epr5_z + scigen_z + fjob12_z + view2_z +
    gender_woman + gender_diverse + eth_minoritised + parent_uni_any +
    school_selective + school_private + imd_quintile_z + prior_science_prep_z,
  data = logit_df,
  use_weights = use_sampling_weights,
  weight_var = weight_var
)

logit_comp_tbl <- tibble::tibble(
  model = c(
    "Theory only",
    "Theory + economics",
    "Controlled baseline",
    "Controlled + economics",
    "Controlled + economics + VIEW2"
  ),
  N = c(
    stats::nobs(model_theory),
    stats::nobs(model_theory_econ),
    stats::nobs(model_controlled_base),
    stats::nobs(model_controlled_econ),
    stats::nobs(model_controlled_view2)
  ),
  AIC = c(
    AIC(model_theory),
    AIC(model_theory_econ),
    AIC(model_controlled_base),
    AIC(model_controlled_econ),
    AIC(model_controlled_view2)
  ),
  BIC = c(
    BIC(model_theory),
    BIC(model_theory_econ),
    BIC(model_controlled_base),
    BIC(model_controlled_econ),
    BIC(model_controlled_view2)
  ),
  Nagelkerke_R2 = c(
    nagelkerke_r2(model_theory),
    nagelkerke_r2(model_theory_econ),
    nagelkerke_r2(model_controlled_base),
    nagelkerke_r2(model_controlled_econ),
    nagelkerke_r2(model_controlled_view2)
  )
)

roc_theory <- roc_from_logit(model_theory)
roc_theory_econ <- roc_from_logit(model_theory_econ)
roc_controlled_base <- roc_from_logit(model_controlled_base)
roc_controlled_econ <- roc_from_logit(model_controlled_econ)
roc_controlled_view2 <- roc_from_logit(model_controlled_view2)

auc_tbl <- tibble::tibble(
  model = c(
    "Theory only",
    "Theory + economics",
    "Controlled baseline",
    "Controlled + economics",
    "Controlled + economics + VIEW2"
  ),
  AUC = c(
    as.numeric(roc_theory$auc),
    as.numeric(roc_theory_econ$auc),
    as.numeric(roc_controlled_base$auc),
    as.numeric(roc_controlled_econ$auc),
    as.numeric(roc_controlled_view2$auc)
  )
)

logit_coef_tbl <- dplyr::bind_rows(
  or_ci_table(model_theory) |> dplyr::mutate(model = "Theory only"),
  or_ci_table(model_theory_econ) |> dplyr::mutate(model = "Theory + economics"),
  or_ci_table(model_controlled_base) |> dplyr::mutate(model = "Controlled baseline"),
  or_ci_table(model_controlled_econ) |> dplyr::mutate(model = "Controlled + economics"),
  or_ci_table(model_controlled_view2) |> dplyr::mutate(model = "Controlled + economics + VIEW2")
)

write_csv_safely(logit_comp_tbl, file.path(out_dir, "38_logit_model_comparison.csv"))
write_csv_safely(auc_tbl, file.path(out_dir, "39_logit_auc.csv"))
write_csv_safely(logit_coef_tbl, file.path(out_dir, "40_logit_coefficients.csv"))

term_label_map <- c(
  scicap_comp_z = "Science capital composite",
  stem_identity_comp = "STEM identity composite",
  epr5_z = "Science qualification -> many jobs",
  scigen_z = "Science/technology create jobs",
  fjob12_z = "Income orientation",
  view2_z = "Scientists earn a lot",
  gender_woman = "Woman (ref. man)",
  gender_diverse = "Gender diverse / prefer not",
  eth_minoritised = "Minoritised ethnicity",
  parent_uni_any = "Parent(s) attended university",
  school_selective = "Selective school",
  school_private = "Private school",
  imd_quintile_z = "IMD quintile (z)",
  prior_science_prep_z = "Prior science preparation"
)

coef_plot_df <- logit_coef_tbl |>
  dplyr::filter(
    term != "(Intercept)",
    model %in% c("Controlled baseline", "Controlled + economics", "Controlled + economics + VIEW2")
  ) |>
  dplyr::mutate(
    term_label = ifelse(term %in% names(term_label_map), unname(term_label_map[term]), term),
    term_label = factor(term_label, levels = rev(unique(term_label))),
    model = factor(model, levels = c("Controlled baseline", "Controlled + economics", "Controlled + economics + VIEW2"))
  )

grDevices::png(file.path(out_dir, "41_logit_odds_ratios_controlled.png"), width = 2000, height = 1200, res = 200)
ggplot2::ggplot(
  coef_plot_df,
  ggplot2::aes(x = term_label, y = OR, ymin = CI_low, ymax = CI_high, shape = model)
) +
  ggplot2::geom_hline(yintercept = 1, linetype = 2, linewidth = 0.5) +
  ggplot2::geom_pointrange(position = ggplot2::position_dodge(width = 0.5)) +
  ggplot2::coord_flip() +
  ggplot2::labs(
    title = "Controlled weighted logistic robustness checks",
    x = NULL,
    y = "Odds ratio (95% CI)",
    shape = NULL
  ) +
  ggplot2::theme_minimal(base_size = 12)
grDevices::dev.off()

grDevices::png(file.path(out_dir, "42_logit_roc_curves.png"), width = 2000, height = 1200, res = 200)
plot(roc_theory, main = "ROC curves for theory-only and controlled logistic models", lwd = 2)
plot(roc_theory_econ, add = TRUE, lwd = 2, lty = 2)
plot(roc_controlled_base, add = TRUE, lwd = 2, lty = 3)
plot(roc_controlled_econ, add = TRUE, lwd = 2, lty = 4)
plot(roc_controlled_view2, add = TRUE, lwd = 2, lty = 5)
legend(
  "bottomright",
  legend = c(
    sprintf("Theory only (AUC = %.3f)", as.numeric(roc_theory$auc)),
    sprintf("Theory + economics (AUC = %.3f)", as.numeric(roc_theory_econ$auc)),
    sprintf("Controlled baseline (AUC = %.3f)", as.numeric(roc_controlled_base$auc)),
    sprintf("Controlled + economics (AUC = %.3f)", as.numeric(roc_controlled_econ$auc)),
    sprintf("Controlled + economics + VIEW2 (AUC = %.3f)", as.numeric(roc_controlled_view2$auc))
  ),
  lwd = 2,
  lty = c(1, 2, 3, 4, 5),
  bty = "n"
)
grDevices::dev.off()
```

```{r logit-comparison}
logit_comp_tbl |>
  knitr::kable(digits = 3, caption = "Weighted logistic model comparison")
```

```{r logit-auc}
auc_tbl |>
  knitr::kable(digits = 3, caption = "AUC for weighted logistic models")
```

```{r odds-ratio-figure, out.width='100%'}
knitr::include_graphics(file.path(out_dir, "41_logit_odds_ratios_controlled.png"))
```

```{r roc-figure, out.width='100%'}
knitr::include_graphics(file.path(out_dir, "42_logit_roc_curves.png"))
```

# 16. Exported files and console summary

```{r summary}
cat("\n============================================================\n")
cat("ASPIRES 3 controlled SEM script complete.\n")
cat("Outputs written to:", out_dir, "\n")
cat("Benchmark SEM available N (pairwise):", nrow(benchmark_sem_df), "\n")
cat("Controlled SEM available N (pairwise):", nrow(controlled_sem_df), "\n")
cat("Controlled SEM complete-case equivalent:", nrow(controlled_sem_cc), "\n")
cat("Controlled latent SEM complete-case equivalent:", nrow(latent_sem_cc), "\n")
cat("Controlled logit common sample:", nrow(logit_df), "\n")
cat("============================================================\n\n")

cat("SEM comparison summary:\n")
print(sem_model_comp_tbl)

cat("\nControlled primary SEM R²:\n")
print(controlled_primary_rsq_tbl)

cat("\nWeighted logistic model comparison:\n")
print(logit_comp_tbl)
```

# 17. Session information

```{r session-info}
sessionInfo()
```
